In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import pickle
from matplotlib import pyplot as plt
import seaborn as sns
# import plotly.plotly as py
# import plotly.offline as pyoff
import plotly.graph_objs as go

import plotly.express as px
import plotly.io as pio
from sklearn.preprocessing import LabelEncoder, StandardScaler

#Settings
%matplotlib inline
pd.options.display.max_columns = None
pd.options.display.float_format = '{:.2f}'.format
pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_rows', 500)
from IPython.core.interactiveshell import InteractiveShell  
InteractiveShell.ast_node_interactivity = "all"
In [3]:
raw = pd.read_csv('../data/raw/bank-full.csv', sep = ";")
raw.head(3)
Out[3]:
age job marital education default balance housing loan contact day month duration campaign pdays previous poutcome y
0 58 management married tertiary no 2143 yes no unknown 5 may 261 1 -1 0 unknown no
1 44 technician single secondary no 29 yes no unknown 5 may 151 1 -1 0 unknown no
2 33 entrepreneur married secondary no 2 yes yes unknown 5 may 76 1 -1 0 unknown no
In [4]:
df = raw.copy()
df = df.reset_index()
In [50]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 18 columns):
age          45211 non-null int64
job          45211 non-null object
marital      45211 non-null object
education    45211 non-null object
default      45211 non-null object
balance      45211 non-null int64
housing      45211 non-null object
loan         45211 non-null object
contact      45211 non-null object
day          45211 non-null int64
month        45211 non-null object
duration     45211 non-null int64
campaign     45211 non-null int64
pdays        45211 non-null int64
previous     45211 non-null int64
poutcome     45211 non-null object
y            45211 non-null object
age_group    45202 non-null category
dtypes: category(1), int64(7), object(10)
memory usage: 5.9+ MB
In [9]:
df.describe()
Out[9]:
age balance day duration campaign pdays previous
count 45211.000000 45211.000000 45211.000000 45211.000000 45211.000000 45211.000000 45211.000000
mean 40.936210 1362.272058 15.806419 258.163080 2.763841 40.197828 0.580323
std 10.618762 3044.765829 8.322476 257.527812 3.098021 100.128746 2.303441
min 18.000000 -8019.000000 1.000000 0.000000 1.000000 -1.000000 0.000000
25% 33.000000 72.000000 8.000000 103.000000 1.000000 -1.000000 0.000000
50% 39.000000 448.000000 16.000000 180.000000 2.000000 -1.000000 0.000000
75% 48.000000 1428.000000 21.000000 319.000000 3.000000 -1.000000 0.000000
max 95.000000 102127.000000 31.000000 4918.000000 63.000000 871.000000 275.000000
In [3]:
age_histogram_distribution = px.histogram(df, x="age", nbins=10,histnorm = 'percent')
age_histogram_distribution.show()
In [5]:
# age_bins = np.arange(10,100,10)
age_bins = [10,30, 40,50,60,96]
#age_group: 10-19; 20-29 etc
df['age_group'] = pd.cut(df['age'], age_bins, include_lowest = True, right = False)
df.head(3)
df['age_group'].value_counts()
Out[5]:
age job marital education default balance housing loan contact day month duration campaign pdays previous poutcome y age_group
0 58 management married tertiary no 2143 yes no unknown 5 may 261 1 -1 0 unknown no [50, 60)
1 44 technician single secondary no 29 yes no unknown 5 may 151 1 -1 0 unknown no [40, 50)
2 33 entrepreneur married secondary no 2 yes yes unknown 5 may 76 1 -1 0 unknown no [30, 40)
Out[5]:
[30, 40)    18089
[40, 50)    11655
[50, 60)    8410 
[10, 30)    5273 
[60, 96)    1784 
Name: age_group, dtype: int64
In [6]:
sum(df['age_group'].value_counts()) == len(df)
Out[6]:
True
In [8]:
labelencoder = LabelEncoder()
non_numeric_features_df = df.select_dtypes(include = ['object', 'category'])
object_features = df.select_dtypes(include = 'object')
for feature in object_features.columns:
    feature_name_label_encode = ('%s_label_encoded' % feature)
    df[feature_name_label_encode] = labelencoder.fit_transform(df[feature].astype(str))
    df[feature_name_label_encode].value_counts()
    df[feature].value_counts()
Out[8]:
1     9732
4     9458
9     7597
0     5171
7     4154
5     2264
6     1579
2     1487
10    1303
3     1240
8     938 
11    288 
Name: job_label_encoded, dtype: int64
Out[8]:
blue-collar      9732
management       9458
technician       7597
admin.           5171
services         4154
retired          2264
self-employed    1579
entrepreneur     1487
unemployed       1303
housemaid        1240
student          938 
unknown          288 
Name: job, dtype: int64
Out[8]:
1    27214
2    12790
0    5207 
Name: marital_label_encoded, dtype: int64
Out[8]:
married     27214
single      12790
divorced    5207 
Name: marital, dtype: int64
Out[8]:
1    23202
2    13301
0    6851 
3    1857 
Name: education_label_encoded, dtype: int64
Out[8]:
secondary    23202
tertiary     13301
primary      6851 
unknown      1857 
Name: education, dtype: int64
Out[8]:
0    44396
1    815  
Name: default_label_encoded, dtype: int64
Out[8]:
no     44396
yes    815  
Name: default, dtype: int64
Out[8]:
1    25130
0    20081
Name: housing_label_encoded, dtype: int64
Out[8]:
yes    25130
no     20081
Name: housing, dtype: int64
Out[8]:
0    37967
1    7244 
Name: loan_label_encoded, dtype: int64
Out[8]:
no     37967
yes    7244 
Name: loan, dtype: int64
Out[8]:
0    29285
2    13020
1    2906 
Name: contact_label_encoded, dtype: int64
Out[8]:
cellular     29285
unknown      13020
telephone    2906 
Name: contact, dtype: int64
Out[8]:
8     13766
5     6895 
1     6247 
6     5341 
9     3970 
0     2932 
3     2649 
4     1403 
10    738  
11    579  
7     477  
2     214  
Name: month_label_encoded, dtype: int64
Out[8]:
may    13766
jul    6895 
aug    6247 
jun    5341 
nov    3970 
apr    2932 
feb    2649 
jan    1403 
oct    738  
sep    579  
mar    477  
dec    214  
Name: month, dtype: int64
Out[8]:
3    36959
0    4901 
1    1840 
2    1511 
Name: poutcome_label_encoded, dtype: int64
Out[8]:
unknown    36959
failure    4901 
other      1840 
success    1511 
Name: poutcome, dtype: int64
Out[8]:
0    39922
1    5289 
Name: y_label_encoded, dtype: int64
Out[8]:
no     39922
yes    5289 
Name: y, dtype: int64
In [9]:
#change month_label_encode
df['month_label_encoded'] = df['month'].map({'jan':1, 
                                           'feb': 2,
                                           'mar': 3,
                                           'apr': 4,
                                           'may': 5,
                                           'jun': 6,
                                           'jul': 7,
                                           'aug': 8,
                                           'sep': 9,
                                           'oct': 10,
                                           'nov': 11,
                                           'dec': 12})
In [58]:
fig = plt.figure(figsize=(12,10))
sns.set()
for i in range(0, len(numeric_features_df.columns), 5):
    ax = sns.pairplot(data=numeric_features_df,
                x_vars=numeric_features_df.columns[i:i+5],
                y_vars=['y_label_encoded'])
plt.tight_layout()  
plt.show();
<Figure size 864x720 with 0 Axes>
In [79]:
avg_yearly_balance_histogram_distribution = px.histogram(df, x="balance", nbins=5,histnorm = 'percent')
avg_yearly_balance_histogram_distribution.show()
In [ ]:
 
In [82]:
duration_histogram_distribution = px.histogram(df, x="duration", nbins=5,histnorm = 'percent')
duration_histogram_distribution.show()
In [97]:
#duration_clipped_below_2000
px.box(df,y="duration")
In [23]:
duration_bins = [0,60,180,300,4919]
df['duration_category'] = pd.cut(df['duration'], duration_bins, include_lowest = True, right = False)
df['duration_category'].value_counts()
sum(df['duration_category'].value_counts()) == len(df)
Out[23]:
[60, 180)      17878
[300, 4919)    12329
[180, 300)     10345
[0, 60)        4659 
Name: duration_category, dtype: int64
Out[23]:
True
In [103]:
px.box(df,y="pdays")
In [17]:
px.histogram(df, x="pdays", nbins=5,histnorm = 'percent')
In [14]:
pdays_bins = [-1,0,180,872]
df['pdays_category'] = pd.cut(df['pdays'], pdays_bins, include_lowest = True, right = False)
sum(df['pdays_category'].value_counts()) == len(df)
len(df.loc[df['pdays'] <0])
Out[14]:
True
Out[14]:
36954
In [89]:
px.box(df,y="balance")
In [11]:
balance_bins = [-8019,0,500,3000,102128]
df['balance_category'] = pd.cut(df['balance'], balance_bins, include_lowest = True, right = False)
df['balance_category'].value_counts()
sum(df['balance_category'].value_counts()) == len(df)
Out[11]:
[0, 500)          19871
[500, 3000)       15959
[3000, 102128)    5615 
[-8019, 0)        3766 
Name: balance_category, dtype: int64
Out[11]:
True
In [6]:
df.head()
Out[6]:
age job marital education default balance housing loan contact day month duration campaign pdays previous poutcome y age_group job_label_encoded marital_label_encoded education_label_encoded default_label_encoded housing_label_encoded loan_label_encoded contact_label_encoded month_label_encoded poutcome_label_encoded y_label_encoded duration_category balance_category
0 58 management married tertiary no 2143 yes no unknown 5 may 261 1 -1 0 unknown no [50, 60) 4 1 2 0 1 0 2 5 3 0 [180, 300) [500, 3000)
1 44 technician single secondary no 29 yes no unknown 5 may 151 1 -1 0 unknown no [40, 50) 9 2 1 0 1 0 2 5 3 0 [60, 180) [0, 500)
2 33 entrepreneur married secondary no 2 yes yes unknown 5 may 76 1 -1 0 unknown no [30, 40) 2 1 1 0 1 1 2 5 3 0 [60, 180) [0, 500)
3 47 blue-collar married unknown no 1506 yes no unknown 5 may 92 1 -1 0 unknown no [40, 50) 1 1 3 0 1 0 2 5 3 0 [60, 180) [500, 3000)
4 33 unknown single unknown no 1 no no unknown 5 may 198 1 -1 0 unknown no [30, 40) 11 2 3 0 0 0 2 5 3 0 [180, 300) [0, 500)
In [24]:
df['is_second_half_of_month'] = np.where(df['day'].isin(np.arange(16,32)),1,0)
df['is_second_half_of_month'].value_counts()
Out[24]:
1    23268
0    21943
Name: is_second_half_of_month, dtype: int64
In [24]:
px.histogram(df, x="campaign",histnorm = 'percent')
In [26]:
df['campaign'].value_counts()
Out[26]:
1     17544
2     12505
3     5521 
4     3522 
5     1764 
6     1291 
7     735  
8     540  
9     327  
10    266  
11    201  
12    155  
13    133  
14    93   
15    84   
16    79   
17    69   
18    51   
19    44   
20    43   
21    35   
22    23   
23    22   
25    22   
24    20   
28    16   
29    16   
26    13   
31    12   
27    10   
32    9    
30    8    
33    6    
34    5    
36    4    
35    4    
43    3    
38    3    
41    2    
50    2    
37    2    
51    1    
55    1    
46    1    
58    1    
44    1    
39    1    
63    1    
Name: campaign, dtype: int64
In [12]:
current_engagement_bins = [1,6,10,64]
df['current_engagement_category'] = pd.cut(df['campaign'], current_engagement_bins, include_lowest = True, right = False)
df['current_engagement_category'].value_counts()
sum(df['current_engagement_category'].value_counts()) == len(df)
Out[12]:
[1, 6)      40856
[6, 10)     2893 
[10, 64)    1462 
Name: current_engagement_category, dtype: int64
Out[12]:
True
In [25]:
px.histogram(df, x="previous",histnorm = 'percent')
In [27]:
df['previous'].value_counts()
Out[27]:
0      36954
1      2772 
2      2106 
3      1142 
4      714  
5      459  
6      277  
7      205  
8      129  
9      92   
10     67   
11     65   
12     44   
13     38   
15     20   
14     19   
17     15   
16     13   
19     11   
23     8    
20     8    
22     6    
18     6    
24     5    
27     5    
29     4    
25     4    
21     4    
30     3    
28     2    
26     2    
37     2    
38     2    
55     1    
40     1    
35     1    
58     1    
51     1    
41     1    
32     1    
275    1    
Name: previous, dtype: int64
In [19]:
prev_engagement_bins = [0,1,6,10,276]
df['prev_engagement_category'] = pd.cut(df['previous'], prev_engagement_bins, include_lowest = True, right = False)
df['prev_engagement_category'].value_counts()

sum(df['prev_engagement_category'].value_counts()) == len(df)

# [0, 1)       # 0 prior engagement
# [1, 6)       #low frequency
# [6, 10)      #mid  
# [10, 276)    #high  
df.pivot_table(index = 'prev_engagement_category', columns = ['poutcome','y'], values = 'index', aggfunc = len)
Out[19]:
[0, 1)       36954
[1, 6)       7193 
[6, 10)      703  
[10, 276)    361  
Name: prev_engagement_category, dtype: int64
Out[19]:
True
Out[19]:
poutcome failure other success unknown
y no yes no yes no yes no yes
prev_engagement_category
[0, 1) nan nan nan nan nan nan 33570.00 3384.00
[1, 6) 3853.00 542.00 1248.00 243.00 465.00 838.00 2.00 2.00
[6, 10) 296.00 54.00 157.00 39.00 49.00 107.00 1.00 nan
[10, 276) 134.00 22.00 128.00 25.00 19.00 33.00 nan nan
In [30]:
df['total_duration'] = df['campaign'] * df['duration']
px.histogram(df, x="total_duration",histnorm = 'percent')
In [31]:
px.box(df,y="total_duration")
In [22]:
total_duration_bins = [0,180,600,60171]
df['total_duration_category'] = pd.cut(df['total_duration'], total_duration_bins, include_lowest = True, right = False)
sum(df['total_duration_category'].value_counts()) == len(df)
Out[22]:
True
In [43]:
df.pivot_table(index = 'total_duration_category', columns = ['poutcome','y'], values = 'index', aggfunc = len)
Out[43]:
poutcome failure other success unknown
y no yes no yes no yes no yes
total_duration_category
[0, 180) 1525 55 513 22 156 96 8936 245
[180, 600) 1960 287 654 127 273 552 15173 1040
[600, 60170) 798 276 366 158 104 330 9463 2101
In [25]:
df['new_customers'] = np.where((df['pdays']==-1),1,0)
df['new_customers'].value_counts()
Out[25]:
1    36954
0    8257 
Name: new_customers, dtype: int64
In [49]:
# df = df.reset_index()
df.head(2)
Out[49]:
index age job marital education default balance housing loan contact day month duration campaign pdays previous poutcome y age_group job_label_encoded marital_label_encoded education_label_encoded default_label_encoded housing_label_encoded loan_label_encoded contact_label_encoded month_label_encoded poutcome_label_encoded y_label_encoded duration_category balance_category is_second_half_of_month pdays_category new_customers total_duration total_duration_category prev_engagement_category current_engagement_category
0 0 58 management married tertiary no 2143 yes no unknown 5 may 261 1 -1 0 unknown no [50, 60) 4 1 2 0 1 0 2 5 3 0 [180, 300) [500, 3000) 0 [-1, 0) 1 261 [180, 600) [0, 1) [1, 6)
1 1 44 technician single secondary no 29 yes no unknown 5 may 151 1 -1 0 unknown no [40, 50) 9 2 1 0 1 0 2 5 3 0 [60, 180) [0, 500) 0 [-1, 0) 1 151 [0, 180) [0, 1) [1, 6)
In [26]:
df.isnull().sum() 
Out[26]:
index                          0
age                            0
job                            0
marital                        0
education                      0
default                        0
balance                        0
housing                        0
loan                           0
contact                        0
day                            0
month                          0
duration                       0
campaign                       0
pdays                          0
previous                       0
poutcome                       0
y                              0
age_group                      0
job_label_encoded              0
marital_label_encoded          0
education_label_encoded        0
default_label_encoded          0
housing_label_encoded          0
loan_label_encoded             0
contact_label_encoded          0
month_label_encoded            0
poutcome_label_encoded         0
y_label_encoded                0
balance_category               0
current_engagement_category    0
pdays_category                 0
prev_engagement_category       0
total_duration                 0
total_duration_category        0
duration_category              0
is_second_half_of_month        0
new_customers                  0
dtype: int64
In [27]:
df.shape
Out[27]:
(45211, 38)
In [28]:
with open('../data/interim/df.pkl', 'wb') as to_write: #pre-processed df
    pickle.dump(df, to_write) 

checkpoint!

In [2]:
with open('../data/interim/df.pkl', 'rb') as read_file:
    df = pickle.load(read_file)

Since observations are ordered by date (from May 2008 to November 2010), use this to impute year

In [44]:
df.loc[(df['day']==31) & (df['month_label_encoded']==5)].tail(1)
# .index.values[0]
Out[44]:
age job marital education default balance housing loan contact day month duration campaign pdays previous poutcome y age_group job_label_encoded marital_label_encoded education_label_encoded default_label_encoded housing_label_encoded loan_label_encoded contact_label_encoded month_label_encoded poutcome_label_encoded y_label_encoded duration_category balance_category
43837 41 entrepreneur married tertiary no 70 no no unknown 31 may 14 1 557 1 failure no [40, 50) 2 1 2 0 0 0 2 5 0 0 [0, 104) [-1945, 73)
In [45]:
df.iloc[df.loc[(df['day']==31) & (df['month_label_encoded']==5)].tail(1).index.values[0]:
        df.loc[(df['day']==31) & (df['month_label_encoded']==5)].tail(1).index.values[0]+2,:]
Out[45]:
age job marital education default balance housing loan contact day month duration campaign pdays previous poutcome y age_group job_label_encoded marital_label_encoded education_label_encoded default_label_encoded housing_label_encoded loan_label_encoded contact_label_encoded month_label_encoded poutcome_label_encoded y_label_encoded duration_category balance_category
43837 41 entrepreneur married tertiary no 70 no no unknown 31 may 14 1 557 1 failure no [40, 50) 2 1 2 0 0 0 2 5 0 0 [0, 104) [-1945, 73)
43838 55 admin. married secondary no 8304 no no cellular 1 jun 201 1 -1 0 unknown yes [50, 60) 0 1 1 0 0 0 0 6 3 1 [181, 319) [3462, 10212)
In [11]:
df.iloc[28904:28906,:]
Out[11]:
age job marital education default balance housing loan contact day month duration campaign pdays previous poutcome y age_group job_label_encoded marital_label_encoded education_label_encoded default_label_encoded housing_label_encoded loan_label_encoded contact_label_encoded month_label_encoded poutcome_label_encoded y_label_encoded duration_category balance_category
28904 26 services single secondary no 110 no no cellular 30 jan 1136 2 -1 0 unknown yes [20, 30) 7 2 1 0 0 0 0 1 3 1 [640, 4918) [73, 449)
28905 31 services single secondary no 295 no no cellular 2 feb 78 5 -1 0 unknown no [30, 40) 7 2 1 0 0 0 0 2 3 0 [0, 104) [73, 449)
In [26]:
sorted(unique_months)
sample['month_label_encoded'].unique()
Out[26]:
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
Out[26]:
array([5])
In [35]:
end_of_the_month_index = []
unique_months = df['month_label_encoded'].unique().tolist()
sample = df.head(1000)
sample.head(2)
for month in unique_months:
    end_of_the_month_index.append(sample.loc[(sample['day'].isin([25,26,27,28,29,30,31])) & (sample['month_label_encoded']==month)].tail(1).index.values[0])
print(end_of_the_month_index)
Out[35]:
age job marital education default balance housing loan contact day month duration campaign pdays previous poutcome y age_group job_label_encoded marital_label_encoded education_label_encoded default_label_encoded housing_label_encoded loan_label_encoded contact_label_encoded month_label_encoded poutcome_label_encoded y_label_encoded duration_category balance_category
0 58 management married tertiary no 2143 yes no unknown 5 may 261 1 -1 0 unknown no [50, 60) 4 1 2 0 1 0 2 5 3 0 [181, 319) [1429, 3462)
1 44 technician single secondary no 29 yes no unknown 5 may 151 1 -1 0 unknown no [40, 50) 9 2 1 0 1 0 2 5 3 0 [104, 181) [-1945, 73)
---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-35-f04e726f88a6> in <module>
      4 sample.head(2)
      5 for month in unique_months:
----> 6     end_of_the_month_index.append(sample.loc[(sample['day'].isin([25,26,27,28,29,30,31])) & (sample['month_label_encoded']==month)].tail(1).index.values[0])
      7 print(end_of_the_month_index)

IndexError: index 0 is out of bounds for axis 0 with size 0
In [41]:
day = np.arange(14,32)
sample.loc[(sample['day'].isin(day)) & (sample['month_label_encoded']==5)].tail(1)
Out[41]:
age job marital education default balance housing loan contact day month duration campaign pdays previous poutcome y age_group job_label_encoded marital_label_encoded education_label_encoded default_label_encoded housing_label_encoded loan_label_encoded contact_label_encoded month_label_encoded poutcome_label_encoded y_label_encoded duration_category balance_category
In [20]:
df.head(1)
Out[20]:
age job marital education default balance housing loan contact day month duration campaign pdays previous poutcome y age_group job_label_encoded marital_label_encoded education_label_encoded default_label_encoded housing_label_encoded loan_label_encoded contact_label_encoded month_label_encoded poutcome_label_encoded y_label_encoded duration_category balance_category
0 58 management married tertiary no 2143 yes no unknown 5 may 261 1 -1 0 unknown no [50, 60) 4 1 2 0 1 0 2 5 3 0 [181, 319) [1429, 3462)
In [53]:
df['balance_category'].nunique()
df['duration_category'].nunique()
Out[53]:
6
Out[53]:
5
In [111]:
df['num_contacts_current_campaign'] = df['campaign'] - df['previous']
In [47]:
contacts_agg_time = df.groupby(['month_label_encoded', 'day'])['campaign'].sum().reset_index(name="num_of_contacts")
contacts_agg_time = contacts_agg_time.copy()
contacts_agg_time['date'] = contacts_agg_time['day'].astype(str) + '-' + contacts_agg_time['month_label_encoded'].astype(str) + '-' + '2010'
contacts_agg_time['date'] = pd.to_datetime(contacts_agg_time['date'], dayfirst=True)
# contacts_agg_time['date'] = contacts_agg_time['date'].apply(lambda x: dt.datetime.strftime(x, '%Y-%m-%d'))
contacts_agg_time.head()
# contacts_agg_time.info()
Out[47]:
month_label_encoded day num_of_contacts date
0 1 6 2 2010-01-06
1 1 7 4 2010-01-07
2 1 8 7 2010-01-08
3 1 11 20 2010-01-11
4 1 12 32 2010-01-12
In [49]:
contacts_agg_time_fig = px.line(contacts_agg_time, x="date", y="num_of_contacts")
contacts_agg_time_fig.show()
In [131]:
contacts_agg_day = df.groupby('day')['campaign'].sum().reset_index(name="num_of_contacts_per_day")
contacts_agg_day.head()
Out[131]:
day num_of_contacts_per_day
0 1 517
1 2 2836
2 3 2808
3 4 3211
4 5 4018
In [132]:
contacts_agg_day_fig = px.line(contacts_agg_day, x="day", y="num_of_contacts_per_day")
contacts_agg_day_fig.show()